Blogcatalog friendship network


In [1]:
import networkx as nx
import numpy as np
import pickle as p
from os import path
from scipy.sparse import csr_matrix, lil_matrix
from matplotlib import pyplot as plt
%matplotlib inline

# Root of the raw BlogCatalog dataset (contains edges.csv and group-edges.csv).
data_loc = './../data/raw/BlogCatalog-dataset/data/'

BlogCatalog is a social blog directory that manages bloggers and their blogs. There are 10,312 bloggers with unique ids ranging from 1 to 10,312 and 333,983 friendship pairs in this dataset. Each blogger can belong to multiple groups. There are 39 groups with indices ranging from 1 to 39.

Load data from edge list


In [2]:
def maybe_load_data(data='./../data/blogcatalog.data', num_groups=39):
    """Load the BlogCatalog dataset, building it from the raw CSVs on first use.

    If a pickled dataset already exists at `data`, it is loaded and returned.
    Otherwise the raw edge and group files under `data_loc` are parsed, node
    ids are remapped to a dense 0-based range (in order of first appearance),
    and the result is pickled to `data` for future runs.

    Args:
        data: Path of the pickled dataset cache.
        num_groups: Number of label groups (39 for BlogCatalog).

    Returns:
        dict with keys:
            'NXGraph'   - networkx.Graph of friendships over remapped ids.
            'LILLabels' - scipy lil_matrix (n_nodes x num_groups) with 1 at
                          (node, group) for each membership.
    """
    if path.exists(data):
        print("Dataset is found. Skip reading...")
        # NOTE: pickle.load on an untrusted file can execute arbitrary code;
        # only load caches produced by this notebook.
        with open(data, 'rb') as f:
            return p.load(f)

    iid = {}  # original id (string) -> dense 0-based id
    idx = 0
    edgelist = []

    # Read edge pairs, assigning dense ids in order of first appearance.
    with open(data_loc + 'edges.csv', 'r') as f:
        for line in f:  # iterate lazily instead of readlines()
            i, j = line.strip().split(',')  # csv
            if i not in iid:
                iid[i] = idx
                idx += 1
            if j not in iid:
                iid[j] = idx
                idx += 1
            edgelist.append((iid[i], iid[j]))

    # Create an undirected networkx graph from the remapped edge list.
    bc = nx.Graph(edgelist)

    print("Number of nodes: ", len(bc))
    print("Number of edges: ", bc.size())

    # Multi-label membership matrix: row = node, column = 0-based group id.
    lil_labels = lil_matrix((len(bc), num_groups), dtype=int)
    with open(data_loc + 'group-edges.csv', 'r') as f:
        for line in f:
            node, group = line.strip().split(',')
            lil_labels[iid[node], int(group) - 1] = 1  # shift 1..39 -> 0..38

    # Pack and cache the dataset. Fix: dump to the requested `data` path
    # (the original hardcoded './../data/blogcatalog.data' and ignored `data`).
    bc_dataset = {'NXGraph': bc, 'LILLabels': lil_labels}
    with open(data, 'wb') as f:
        p.dump(bc_dataset, f)
    return bc_dataset

bc_dataset = maybe_load_data()


Dataset is found. Skip reading...

Load labels as a sparse matrix

In the original dataset, the group index is in the range 1 to 39. For computational convenience, I will convert the group ids to the range 0-38.


In [3]:
# Multi-label group-membership matrix (lil_matrix, n_nodes x 39, 0-based groups).
labels = bc_dataset['LILLabels']

Dump as edgelist

A plain edge-list file with the remapped node ids is needed as input for other algorithms.


In [4]:
# Dump the remapped graph as a plain edge list for external tools.
# Fix: the original referenced `bc`, which is local to maybe_load_data()
# and undefined at notebook scope on a fresh run; use the graph stored
# in the returned dataset instead.
if not path.exists('./../data/blogcatalog.edges'):
    nx.write_edgelist(bc_dataset['NXGraph'], path='./../data/blogcatalog.edges', data=False)
else:
    print("Edge list file is found. Skip writing...")


Edge list file is found. Skip writing...

Graph analysis

Degree distribution


In [5]:
# Friendship graph (networkx.Graph) with dense 0-based node ids.
b = bc_dataset['NXGraph']

In [6]:
# All node degrees, sorted in descending order.
degree_list = sorted((deg for _, deg in b.degree()), reverse=True)

Degree statistics:


In [7]:
# Summary statistics of the (descending) degree distribution.
degree_stats = [
    ("Maximum degree", degree_list[0]),
    ("Minimum degree", degree_list[-1]),
    ("Mean of degree distribution", np.mean(degree_list)),
    ("Std variation of degree distribution", np.std(degree_list)),
]
for label, value in degree_stats:
    print("{}: {}".format(label, value))


Maximum degree: 3992
Minimum degree: 1
Mean of degree distribution: 64.7756012412723
Std variation of degree distribution: 177.6973696826509

Degree distribution plot:


In [8]:
# Log-scale degree distribution plot (explicit axes interface).
fig, ax = plt.subplots()
ax.semilogy(degree_list)
ax.set_title("BlogCatalog network degree distribution")
ax.set_ylabel("Log degree")
ax.set_xlabel("Nodes")
ax.legend(["Log-scale degree"])


Out[8]:
<matplotlib.legend.Legend at 0x7f7961b687f0>

Random walk plot


In [11]:
# Make the local project modules importable.
# Fix: `from sys import path` shadowed the earlier `from os import path`,
# breaking every subsequent `path.exists(...)` call on re-run. Appending
# via `sys.path` has the same effect without the name collision.
import sys
sys.path.append('./../src/')
from walks import WalkGenerator
from constrains import R, UTriangle, UWedge

In [10]:
# Uniform random walks (R constraint): the generator yields walk contexts.
random_walker = WalkGenerator(graph=b, constrain=R())
bc_context = list(random_walker(walk_length=80, num_walk=10))


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-10-48c20050c0d6> in <module>()
----> 1 random_walker = WalkGenerator(graph=b, constrain=R())
      2 bc_context = [i for i in random_walker(walk_length=80, num_walk=10)]

NameError: name 'WalkGenerator' is not defined

In [32]:
from collections import Counter

# Per-node visit counts over the first random-walk context.
# Counter replaces the manual defaultdict(int) counting loop; downstream
# cells only use .values(), which Counter provides identically.
random_walk_node_freqs = Counter(bc_context[0])

In [33]:
# Node-visit counts in descending order, plotted on a log scale.
random_walk_node_hist = sorted(random_walk_node_freqs.values(), reverse=True)
fig, ax = plt.subplots()
ax.semilogy(random_walk_node_hist)
ax.set_title("Blogcatalog random walk node frequency distribution")
ax.set_ylabel("Log node count")
ax.set_xlabel("Nodes")


Out[33]:
<matplotlib.text.Text at 0x7f8e02777a58>

In [34]:
# Summary statistics of the random-walk visit-frequency distribution.
freq_stats = [
    ("Maximum frequency", random_walk_node_hist[0]),
    ("Minimum frequency", random_walk_node_hist[-1]),
    ("Mean of frequency distribution", np.mean(random_walk_node_hist)),
    ("Std variation of frequency distribution", np.std(random_walk_node_hist)),
]
for label, value in freq_stats:
    print("{}: {}".format(label, value))


Maximum frequency: 48922
Minimum frequency: 10
Mean of frequency distribution: 800.0
Std variation of frequency distribution: 2184.091607937247

In [39]:
# Persist the random-walk contexts, one 80-node walk per line.
with open('../data/blogcatalog.random_context', 'w') as f:
    for walk in bc_context[0].reshape(-1, 80):
        f.write(' '.join(str(node) for node in walk) + '\n')

Triangle motif walk plot


In [12]:
import time  # NOTE(review): `time` appears unused in this notebook — confirm before removing
# Motif-biased walker: transitions constrained to undirected triangles.
triangle_walker = WalkGenerator(graph=b, constrain=UTriangle())

In [16]:
# Materialize the triangle-constrained walk contexts (length 80, 10 per node).
bc_triangle_context = list(triangle_walker(walk_length=80, num_walk=10))


Walking total of 103120 walks, will yield every 8249600 nodes...

In [17]:
from collections import Counter

# Per-node visit counts over the first triangle-walk context.
# Counter replaces the manual defaultdict(int) counting loop; downstream
# cells only use .values(), which Counter provides identically.
triangle_walk_node_freqs = Counter(bc_triangle_context[0])

In [18]:
# Triangle-walk visit counts in descending order, plotted on a log scale.
triangle_walk_node_hist = sorted(triangle_walk_node_freqs.values(), reverse=True)
fig, ax = plt.subplots()
ax.semilogy(triangle_walk_node_hist)
ax.set_title("Blogcatalog triangle walk node frequency distribution")
ax.set_ylabel("Log node count")
ax.set_xlabel("Nodes")


Out[18]:
<matplotlib.text.Text at 0x7f795c7e8dd8>

In [20]:
# Persist the triangle-walk contexts, one 80-node walk per line.
with open('../data/blogcatalog.triangle_context', 'w') as f:
    for walk in bc_triangle_context[0].reshape(-1, 80):
        f.write(' '.join(str(node) for node in walk) + '\n')

In [ ]: